## Annual CF model for all key plants

# Prepare an annual flow time series for all dams using DayFlow
prep_annual_inflow <- function(target_plants_mapped_to_water,
                               gen_aCF_1980_2022,
                               Dayflow_hydro_dams,
                               resOps_releases){

  
  # get historical total annual inflows from DayFlow (1980 - 2019)
  arrow::read_parquet(Dayflow_hydro_dams) ->
    annual_flow_by_COMID
  
  # prepare annual flows for ResOps sites
  resOps_releases |> 
    filter(year(date) <= 2019) |> 
    # convert to BCM
    mutate_if(is.numeric, function(x) x * 24 * 60 * 60 * 1e-9) |>
    mutate_if(is.numeric, function(x) na.approx(x, maxgap = 14, na.rm = FALSE)) |> 
    mutate(year = year(date)) |> 
    select(-date) |> 
    group_by(year) |> 
    summarise_if(is.numeric, sum) |> ungroup() |> 
    select_if(~ !any(is.na(.))) |> 
    pivot_longer(-c(year), names_to = "grand_id",
                 values_to = "flow_BCM") |> 
    mutate(year = as.integer(year), grand_id = as.integer(grand_id)) ->
    resOps_releases_annual
  
  
  target_plants_mapped_to_water |> 
    filter(grand_id %in% resOps_releases_annual[["grand_id"]]) |> 
    select(RHPID, grand_id) |> unique() -> grand_to_RHPID
  
  
  # create flow table to populate
  as_tibble(
    expand.grid(year = 1980:2019,
                RHPID = unique(target_plants_mapped_to_water[["RHPID"]]))
  ) |> 
    left_join(target_plants_mapped_to_water |> 
                  select(RHPID, COMID) |> unique(),
              by = join_by(RHPID)) ->
    flow_template

  # identify plants with associated COMID and annual dayflow data
  flow_template |> filter(!is.na(COMID)) |> 
    left_join(annual_flow_by_COMID,
              by = join_by(year, COMID)) |> 
    mutate(Corrected = if_else(is.infinite(Corrected), Naturalized, Corrected)) |> 
    select(RHPID, year, flow_BCM = Corrected) |> 
    filter(!is.na(flow_BCM)) |> 
    mutate(flow_mean = mean(flow_BCM), .by = RHPID) |> 
    filter(flow_mean != 0) |> select(-flow_mean) |> 
    mutate(flowtype = "Dayflow") ->
    annual_flow_data
    
  # identify plants with missing COMID, requiring a dayflow copy
  flow_template |>
    filter(!RHPID %in% unique(annual_flow_data[["RHPID"]])) |> 
    select(RHPID, year) |> 
    mutate(flowtype = "copy") |> 
    filter(!RHPID %in% grand_to_RHPID[["RHPID"]]) ->
    annual_flow_missing
  
  # distinguish between storage and run-of-river plants
  target_plants_mapped_to_water |> 
    mutate(huc_4 = substr(huc_12, 1, 4),
           huc_2 = substr(huc_12, 1, 2)) |> 
    mutate(dominant_mode = if_else(dominant_mode == "Run-of-river",
                                   "RoR", "Other")) |> 
    replace_na(list(dominant_mode = "Other")) ->
    target_plants_with_modes

  # Euclidean distance function for identifying nearest dam
  euclidean_dist <- function(x1, x2, y1, y2) sqrt((x2-x1)^2 + (y2-y1)^2)
  
  # get vector of plants lacking water information
  annual_flow_missing |> 
    pull(RHPID) |> unique() ->
    missing_plants
  
  # find matching plant
   missing_plants |> 
    map_dfr(function(plant){
      

      target_plants_with_modes |> 
        filter(RHPID == plant) |> 
        select(-EIA_ID, -COMPLXID) |> 
        unique() ->
        plant_loc_and_mode
      
      # make sure no duplicates of RHPID with water detail
      if(nrow(plant_loc_and_mode) > 1) message(plant)
      
      target_plants_with_modes |> 
        filter(!RHPID %in% missing_plants) |> 
        pull(RHPID) |> unique() ->
        plants_with_flow
      
      # identify plants in same HUC4 with same mode
      target_plants_with_modes |> 
        filter(huc_4 == plant_loc_and_mode[["huc_4"]],
               dominant_mode == plant_loc_and_mode[["dominant_mode"]],
               RHPID %in% plants_with_flow) ->
        choices
      
      # identify plants in same HUC2 with same mode
      target_plants_with_modes |> 
        filter(huc_2 == plant_loc_and_mode[["huc_2"]],
               dominant_mode == plant_loc_and_mode[["dominant_mode"]],
               RHPID %in% c(plants_with_flow)) ->
        huc2_choices
      
      # identify plants in same HUC4, any mode
      target_plants_with_modes |> 
        filter(huc_4 == plant_loc_and_mode[["huc_4"]],
               RHPID %in% c(plants_with_flow)) ->
        huc4_choices_no_mode
      
      # identify plants in same HUC2, any mode
      target_plants_with_modes |> 
        filter(huc_2 == plant_loc_and_mode[["huc_2"]],
               RHPID %in% c(plants_with_flow)) ->
        huc2_choices_no_mode
      
      if(nrow(choices) == 0) choices <- huc2_choices
      if(nrow(choices) == 0) choices <- huc4_choices_no_mode
      if(nrow(choices) == 0) choices <- huc2_choices_no_mode
      if(nrow(choices) == 0){
        choices <- filter(target_plants_with_modes,
                          dominant_mode == plant_loc_and_mode[["dominant_mode"]],
                          RHPID %in% plants_with_flow)
      } 
      
      # find closest by distance from viable choices
      choices |> 
        mutate(dist = euclidean_dist(pt_lon, plant_loc_and_mode[["pt_lon"]],
                                     pt_lat, plant_loc_and_mode[["pt_lat"]])) |> 
        arrange(dist) |> 
        summarise(RHPID_match = first(RHPID)) |> 
        mutate(RHPID = plant)
        
    }) ->
    plant_matches
  
   resOps_releases_annual |>
     left_join(grand_to_RHPID, by = "grand_id") |> 
     select(-grand_id) |> 
     mutate(flowtype = "ResOpsUS") ->
     annual_inflow_ResOps
  
   annual_flow_data |> 
     filter(!RHPID %in% grand_to_RHPID[["RHPID"]]) |> 
     bind_rows(annual_inflow_ResOps) ->
     annual_flow_all_RHPID_minus_copies
  
  # fill out plants with missing annual water
  annual_flow_missing |> 
    left_join(plant_matches,
              join_by(RHPID)) |> select(-flowtype) |> 
    left_join(annual_flow_all_RHPID_minus_copies,
              by = c("year", "RHPID_match" = "RHPID")) |> 
    mutate(flowtype = paste0("copy", "-", flowtype, "-", RHPID_match)) |> 
    select(RHPID, year, flow_BCM, flowtype) ->
    annual_flow_copies

  bind_rows(annual_flow_all_RHPID_minus_copies,
            annual_flow_copies) ->
    annual_inflow_complete
  
  # # compare with resOps...
  # resOps_releases_annual |>
  #   left_join(grand_to_RHPID, by = "grand_id") |>
  #   rename(flow_grand = flow_BCM) |>
  #   left_join(annual_inflow_complete) |>
  #   ggplot(aes(year, flow_grand)) + geom_line() +
  #   geom_line(aes(y = flow_BCM), col = "red") +
  #   facet_wrap(~RHPID, scales = "free_y")
   
  return(
    annual_inflow_complete
  )
  
}


prep_annual_energy_CF_water <- function(target_plants_mapped_to_water,
                                        gen_aCF_1980_2022,
                                        annual_flow_all_plants){

  gen_aCF_1980_2022 |> 
    select(EIA_ID, year, gen_MWH = TOTAL, cap_MWh) |> 
    left_join(
      target_plants_mapped_to_water |> 
        select(EIA_ID, RHPID, H_m),
      by = join_by(EIA_ID)
    ) |> 
    summarise(
      gen_MWh = sum(gen_MWH),
      cap_MWh = sum(cap_MWh),
      H_m = mean(H_m),
      .by = c(year, RHPID)
    ) |> 
    mutate(CF = gen_MWh / cap_MWh) |> 
    left_join(annual_flow_all_plants,
              by = join_by(year, RHPID)) |> 
    # upper bound potential energy from the dam
    mutate(PE_MWh = (1000 * 9.81 * H_m * flow_BCM * 1e-9) / (3.6 * 1e-9)) |> 
    # capacity potential
    mutate(CP = PE_MWh / cap_MWh) -> annual_energy_CF_water
    
    # annual_energy_CF_water |>
    #   filter(grepl("HOOVER", RHPID)) |>
    #   filter(year >= 2008) |>
    #   ggplot(aes(CP, CF)) + geom_point() +
    #   ylim(0, 1.2) + xlim(0, 2.5) +
    #   geom_abline(slope = 1) + geom_hline(yintercept = 1)
  
  return(annual_energy_CF_water)
}

fit_annual_CF_models <- function(annual_energy_CF_water){

  annual_energy_CF_water |>
    filter(year %in% 1980:2019) |> 
    select(year, RHPID, CF_act = CF, CP, flow_BCM, flowtype) |> 
    split(~RHPID) |> 
    map_dfr(function(plant_CF_flow){
      
      message(plant_CF_flow$RHPID[1])
      
      plant_CF_flow |> 
        filter(year %in% 2005:2019) ->
        plant_CF_flow_ready
      
      if(all(is.na(plant_CF_flow_ready$CF_act))) return(tibble())
      
      # assess linear approach first
      lm(plant_CF_flow_ready[["CF_act"]] ~ plant_CF_flow_ready[["flow_BCM"]]) ->
        linear_CF_model

      get_TmodX2_rmse <- function(x){
        p1 <- x[1]
        p2 <- x[2]
        plant_CF_flow_ready |>
          mutate(CF_mod = 1 + CP -
                   (p2 + CP ^ p1)^(1/p1)) ->
          pred_added
        pull(pred_added, CF_mod) -> result
        pull(pred_added, CF_act) -> target
        return(
          sqrt(mean((result - target)^2, na.rm = T))
        )
      }
      
      if(all(is.na(plant_CF_flow_ready[["CP"]]))){
        TmodX2 <- list(par = c(NA_real_, NA_real_))
      }else{
        TmodX2 <- optim(par = c(1,1),
                        fn = get_TmodX2_rmse,
                        lower = c(1,1),
                        upper = c(10, 20),
                        method = "L-BFGS-B")
      }
      
      # check
      # plant_CF_flow_ready |> 
      #   mutate(CF_mod = 1 + CP - 
      #            (TmodX2$par[2] +CP ^ TmodX2$par[1])^(1/TmodX2$par[1])) |> 
      #     ggplot(aes(CP, CF_act)) + geom_point() +
      #     ylim(0, 1.2) + xlim(0, 2.5) +
      #     geom_abline(slope = 1) + geom_hline(yintercept = 1) +
      #   geom_point(aes(y = CF_mod), col = "red")
      
      plant_CF_flow |>
        mutate(
          Lmod_p1 = linear_CF_model$coefficients[[1]],
          Lmod_p2 = linear_CF_model$coefficients[[2]],
          Xmod_p1 = TmodX2$par[1],
          Xmod_p2 = TmodX2$par[2],
          CF_Lmod = Lmod_p1 + flow_BCM * Lmod_p2,
          CF_Xmod = 1 + CP -
                 (Xmod_p2 +CP ^ Xmod_p1)^(1/Xmod_p1)
        ) -> modeled_CF
        
      modeled_CF |> 
        mutate(Xmod_error = CF_Xmod - CF_act) |> 
        pull(Xmod_error) |> mean(na.rm = T) |> abs() -> bias
        
      modeled_CF |> 
        mutate(choice = if_else(bias > 0.01 | any(is.na(CF_Xmod)),
                                "L", "X")) ->
        modeled_Cf_with_choice
      
      return(modeled_Cf_with_choice)
      
    }) -> models
  
  return(models)
  
}





  
  